Scenario 1: Random Scatter
To begin, pursue the point of view that structure in the data is indicated by departures from a uniform scatter of palindromes across the DNA.
Of course, a random uniform scatter does that mean that palindromes will be equally spaced as milestones on a freeway. There will be some gaps on the DNA where no palindromes occur, and there will be some clumping together of palindromes.
To look for structure examine the locations of the palindromes, the spacing between palindromes, and the counts of palindromes in non overlapping regions of the DNA. One starting place might be to see first how random scatter looks by using a computer to simulate it.
A computer can simulate 296 palindrome sites chosen at random along a DNA sequence of 229,354 bases using a pseudo random number generator. When this is done several times, by making seller sets of simulated palindrome locations, then the real data can be compared to the simulated data.
set.seed(0)
color <- 'red'
# Generate 3 samples from the uniform distribution with the same size and bounds as our data.
samples = list(sort(runif(n, min=0, max=N)), sort(runif(n, min=0, max=N)), sort(runif(n, min=0, max=N)))
# Dot plot of locations of palindromes in original data and uniform scatter.
title1 <- 'Locations of Palindromes'
title2 <- c(title1,'(Simulated)')
x.axis <- 'Base Pair'
symbol <- 3
stripchart(locations, pch=symbol, col=color, main=title1, xlab=x.axis)

for (sample in samples) {
stripchart(sample, pch=symbol, main=title2, xlab=x.axis)
}



# Additional dot plot of locations of palindromes in original data and uniform scatter.
dotchart(locations, color=color, main=title1, xlab=x.axis)

for (sample in samples) {
dotchart(sample, main=title2, xlab=x.axis)
}



# Histogram of locations of palindromes in original data and uniform scatter.
bins <- 35
hist(locations, col=color, breaks=bins, main=title1, xlab=x.axis)

for (sample in samples) {
hist(sample, breaks=bins, main=title2, xlab=x.axis)
}



# Scatterplot of spacing between consecutive palindromes
title1 <- 'Spacing between Consecutive Palindromes'
title2 <- c(title1,'(Simulated)')
x.axis <- 'Base Pair Location'
y.axis <- 'Distance (Base Pairs) from Previous Palindrome'
y.range <- c(0,5000)
plot(locations[-1], diff(locations), col=color, ylim=y.range, main=title1, xlab=x.axis, ylab=y.axis)

for (sample in samples) {
plot(sample[-1], diff(sample), ylim=y.range, main=title2, xlab=x.axis, ylab=y.axis)
}



# Histogram of counts of palindromes in non-overlapping regions in original data and uniform scatter.
interval.length <- 2500
title1 <- paste('Number of Palindromes in Non-Overlapping Regions of Length', interval.length)
title2 <- c(title1,'(Simulated)')
x.axis <- 'Number of Palindromes'
bins <- seq(0,20,1)
hist(as.vector(table(cut(locations, breaks=seq(0,N,interval.length), include.lowest = TRUE))), breaks=bins, col=color, main=title1, xlab=x.axis)

for (sample in samples) {
hist(as.vector(table(cut(sample, breaks=seq(0,N,interval.length), include.lowest = TRUE))), breaks=bins, main=title2, xlab=x.axis)
}



Scenario 2: Locations and Spacings
Use graphical methods to examine the spacings between consecutive palindromes and sum of consecutive pairs, triplets, etc, spacings. Compare what you find to what you would expect to find in a random scatter. Also, use graphical methods to compare locations of the palindromes.
Locations We performed the tests three times over three different lengths of sub-intervals.
# Histogram of locations of palindromes in original data and uniform scatter
uniform <- sample.int(N, size = n)
hist(locations, breaks = 20, probability = TRUE, col = rgb(1,0,0,0.5), main = "Location Data Distribution Comparison", xlab= "Palindrome Locations")
lines(density(locations, adjust = 2), col = 2)
hist(uniform,breaks = 20, probability = TRUE, col = rgb(0,0,1,0.5), add = TRUE)
lines(density(uniform, adjust = 2), col = 4)
legend(x = 180000, y = 0.000008, legend = c("Sample", "Uniform"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-square Goodness of Fit Test
# Case 1: k(number of sub-intervals) = 21
k <- 20
locations.expected <- n/k
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
locations.observed <- as.vector(tab)
chi_2 <- sum((locations.observed - locations.expected)^2/locations.expected)
chi2_compare <- qchisq(p = 0.95, df = 19)
p_value <- pchisq(chi_2, df = 19, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 20 sub-intervals) against uniform distribution\n"))
When conducting chi_square Goodness of fit test comparing locations(divided in 20 sub-intervals) against uniform distribution
NULL
print(paste("The value of chi_square statistic is", chi_2))
[1] "The value of chi_square statistic is 17.9189189189189"
print(paste("The p_value is", p_value))
[1] "The p_value is 0.527860332119311"
## Visualization of the Residual
Residuals <- (locations.observed - locations.expected) / sqrt(locations.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 20 sub-intervals)")

# Case 2: k(number of sub-intervals) = 100
k <- 30
locations.expected <- n/k
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
locations.observed <- as.vector(tab)
chi_2 <- sum((locations.observed - locations.expected)^2/locations.expected)
chi2_compare <- qchisq(p = 0.95, df = 29)
p_value <- pchisq(chi_2, df = 29, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 30 sub-intervals) against uniform distribution\n"))
When conducting chi_square Goodness of fit test comparing locations(divided in 30 sub-intervals) against uniform distribution
NULL
print(paste("The value of chi_square statistic is", chi_2))
[1] "The value of chi_square statistic is 40.6891891891892"
print(paste("The p_value is", p_value))
[1] "The p_value is 0.0732835870345071"
## Visualization of the Residual
Residuals <- (locations.observed - locations.expected) / sqrt(locations.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 30 sub-intervals)")

# Case 3: k(number of sub-intervals) = 500
k <- 60
locations.expected <- n/k
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
locations.observed <- as.vector(tab)
chi_2 <- sum((locations.observed - locations.expected)^2/locations.expected)
chi2_compare <- qchisq(p = 0.95, df = 59)
p_value <- pchisq(chi_2, df = 59, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 60 sub-intervals) against uniform distribution\n"))
When conducting chi_square Goodness of fit test comparing locations(divided in 60 sub-intervals) against uniform distribution
NULL
print(paste("The value of chi_square statistic is", chi_2))
[1] "The value of chi_square statistic is 79"
print(paste("The p_value is", p_value))
[1] "The p_value is 0.0421403871302519"
## Visualization of the Residual
Residuals <- (locations.observed - locations.expected) / sqrt(locations.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 60 sub-intervals)")

Inference: Null Hypothesis: Locations are distributed uniformly. Conclusion: Since p-value of this chi-square test is smaller than 0.05, we fail to reject the null hypothesis. It indicates that deviations as large as ours are not so likely. Hence, we conclude that it appears that uniform is not a reasonable initial model. Residual Conclusion: Since the value of the standardized residual is larger than 3, the probability model of a uniform distribution is lack of fit.
Spacings We performed the tests three times over three different lengths of sub-intervals.
# Consecutive Pairs
locations.sorted = sort(locations, decreasing = FALSE)
distance.pair <- abs(locations.sorted[-1]-locations.sorted[-length(locations.sorted)])
# Histogram of spacings of palindromes in original data and exponential distribution
hist(distance.pair, breaks= 15, col = rgb(1,0,0,0.5), probability = TRUE, main = "Consecutive Pairs Spacings Distribution Comparison", xlab = "Distance between Consecutive Palindromes Locations", ylim = c(0,0.001))
lines(density(distance.pair, adjust = 2), col = rgb(1,0,0,0.5))
Expo <- rexp(n-1, rate = 1/mean(distance.pair))
hist(Expo, breaks = 15, col = rgb(0,0,1,0.5), probability = TRUE, add = TRUE)
lines(density(Expo, adjust = 2), col = rgb(0,0,1,0.5))
legend(x = 4200, y = 0.0009, legend = c("Sample", "Exponential"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-square Goodness of Fit Test
# Construct observed numbers of intervals
# Case 1: Divided in 7 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.pair, decreasing = FALSE)
lambda <- 1/mean(distance.pair)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(0,0.05, 0.1, 0.3, 0.5, 0.7, 0.9,1)))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.pair, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_7 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_7
chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 5)
p_value <- pchisq(chi_2, df = 5, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 7 sub-intervals is", p_value))
[1] "The p_value when the distance is splited into 7 sub-intervals is 6.42447875347949e-05"
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 7 bins)")

# Case 2: Divided in 10 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.pair, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.1))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.pair, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_10 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_10
chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 8)
p_value <- pchisq(chi_2, df = 8, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 10 sub-intervals is", p_value))
[1] "The p_value when the distance is splited into 10 sub-intervals is 0.000218981752365422"
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 10 bins)")

# Case 3: Divided in 20 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.pair, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.05))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.pair, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_20 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_20
chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 18)
p_value <- pchisq(chi_2, df = 18, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 20 sub-intervals is", p_value))
[1] "The p_value when the distance is splited into 20 sub-intervals is 0.0117020696134169"
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 20 bins)")

# Consecutive Triplets
locations.sorted <- sort(locations, decreasing = FALSE)
locations.triplets <- locations.sorted[-length(locations.sorted)]
distance.triplets <- abs(locations.sorted[-1][-1]-locations.triplets[-length(locations.triplets)])
# Histogram of spacings of palindromes in original data and exponential distribution
hist(distance.triplets, breaks= 15, col = rgb(1,0,0,0.5), probability = TRUE, main = "Consecutive Triplets Spacings Distribution Comparison", xlab = "Distance between Consecutive Palindromes Locations", ylim = c(0,0.001))
lines(density(distance.triplets, adjust = 2), col = rgb(1,0,0,0.5))
Gam <- rgamma(n-2, 2, rate = 2/mean(distance.triplets))
hist(Gam, breaks = 15, col = rgb(0,0,1,0.5), probability = TRUE, add = TRUE)
lines(density(Gam, adjust = 2), col = rgb(0,0,1,0.5))
legend(x = 4200, y = 0.0005, legend = c("Sample", "Gamma"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-square Goodness of Fit Test
# Construct observed numbers of intervals
# Case 1: Divided in 7 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.triplets, decreasing = FALSE)
lambda <- 2/mean(distance.pair)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(0,0.05, 0.1, 0.3, 0.5, 0.7, 0.9,1)))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.triplets, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_7 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_7
chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 5)
p_value <- pchisq(chi_2, df = 5, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 7 sub-intervals is", p_value))
[1] "The p_value when the distance is splited into 7 sub-intervals is 0"
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 7 bins)")

# Case 2: Divided in 10 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.triplets, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.1))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.triplets, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_10 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_10
chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 8)
p_value <- pchisq(chi_2, df = 8, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 10 sub-intervals is", p_value))
[1] "The p_value when the distance is splited into 10 sub-intervals is 0"
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 10 bins)")

# Case 3: Divided in 20 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.triplets, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.05))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.triplets, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_20 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_20
chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 18)
p_value <- pchisq(chi_2, df = 18, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 20 sub-intervals is", p_value))
[1] "The p_value when the distance is splited into 20 sub-intervals is 0"
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 20 bins)")

Scenario 3: Counts
Use graphical methods and more formal statistical tests to examine the counts of palindromes in various regions of the DNA. Split the DNA into nonoverlapping regions of equal length to compare the number of palindomres in an interval to the number of that you would expect from uniform random scatter. The counts for shorter regions will be more variable than those for logner regions. Also, consider classifying the regions according to the number of counts.
k <- 50
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
counts.obs <- as.vector(tab)
a <- table(cut(counts.obs, breaks = seq(min(counts.obs), max(counts.obs), length.out = k+1), include.lowest = TRUE))
# Histogram of counts of palindromes in original data and poisson distribution
hist(counts.obs, breaks = 15, col = rgb(1,0,0,0.5), probability = TRUE, main = "Counts Distribution Comparison (50 Sub-intervals)", xlab = "Number of Palindromes Sites Inside an Interval", ylim = c(0,0.2))
lines(density(counts.obs, adjust = 2), col = rgb(1,0,0,0.5))
Pois <- rpois(n, lambda = mean(counts.obs))
hist(Pois, breaks = 15, col = rgb(0,0,1,0.5), probability = TRUE, add = TRUE)
lines(density(Pois, adjust = 2), col = rgb(0,0,1,0.5))
legend(x = 12, y = 0.15, legend = c("sample", "Poisson"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-Square Testing
# Construct palindrome count number
palindrome.count <- c("0,1,2", "3","4","5","6","7","8+")
# Construct observed number of intervals
intervals.observed <- c(8, 9, 13, 10, 8, 8, 5)
# Construct expected number of intervals
expected <- c()
lambda <- n/k
for (i in c(0:17)){
expect <- k* exp(-lambda)* lambda**i /factorial(i)
expected <- c(expected, expect)
}
sum <- 0
for (j in c(9:17)){
sum <- sum+expected[j]
}
intervals.expected <- c(expected[1]+expected[2]+expected[3],expected[4],expected[5],expected[6],expected[7],expected[8],sum)
# Create contingency table
b <- data.frame(palindrome.count,intervals.observed,intervals.expected)
b
chi_2 <- sum((intervals.observed - intervals.expected)^2/intervals.expected)
chi2_compare <- qchisq(p = 0.95, df = 7)
p_value <- pchisq(chi_2, df = 7, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 500 sub-intervals) against uniform distribution\n"))
When conducting chi_square Goodness of fit test comparing locations(divided in 500 sub-intervals) against uniform distribution
NULL
print(paste("The value of chi_square statistic is", chi_2))
[1] "The value of chi_square statistic is 21.2720502400911"
print(paste("The p_value is", p_value))
[1] "The p_value is 0.00338769156571031"
---
title: "CASE STUDY 3: SEARCH FOR THE UNUSUAL CLUSTER IN THE PALINDROMES"
output: html_notebook
---



## Question
In this paper, we will search for unusual clusters of complementary palindromes. The overarching research question is: “How do we find clusters of palindromes? How do we determine whether a cluster is just a chance occurrence or a potential replication site? Based on our analysis, we will then provide recommendations to biologists who are about to start experimentally searching for the origin of replication.



## Setup
```{r}
locations <- read.table('hcmv-25kgjn1-1rfrtkc.txt', header=TRUE)$location  # Original
health <- read.csv('RAW_DATA-2iwcznn-2kr2xw0.csv', header = TRUE)  # Additional

N <- 229354  # Base pairs
n <- 296  # Palindromes
```



## Scenario 1: Random Scatter
To begin, pursue the point of view that structure in the data is indicated by departures from a uniform scatter of palindromes across the DNA.

*Of course, a random uniform scatter does that mean that palindromes will be equally spaced as milestones on a freeway. There will be some gaps on the DNA where no palindromes occur, and there will be some clumping together of palindromes.*

To look for structure examine the locations of the palindromes, the spacing between palindromes, and the counts of palindromes in non overlapping regions of the DNA. One starting place might be to see first how random scatter looks by using a computer to simulate it.

*A computer can simulate 296 palindrome sites chosen at random along a DNA sequence of 229,354 bases using a pseudo random number generator. When this is done several times, by making seller sets of simulated palindrome locations, then the real data can be compared to the simulated data.*
```{r}
set.seed(0)
color <- 'red'

# Generate 3 samples from the uniform distribution with the same size and bounds as our data.
samples = list(sort(runif(n, min=0, max=N)), sort(runif(n, min=0, max=N)), sort(runif(n, min=0, max=N)))

# Dot plot of locations of palindromes in original data and uniform scatter.
title1 <- 'Locations of Palindromes'
title2 <- c(title1,'(Simulated)')
x.axis <- 'Base Pair'
symbol <- 3
stripchart(locations, pch=symbol, col=color, main=title1, xlab=x.axis)
for (sample in samples) {
  stripchart(sample, pch=symbol, main=title2, xlab=x.axis)
}

# Additional dot plot of locations of palindromes in original data and uniform scatter.
dotchart(locations, color=color, main=title1, xlab=x.axis)
for (sample in samples) {
  dotchart(sample, main=title2, xlab=x.axis)
}

# Histogram of locations of palindromes in original data and uniform scatter.
bins <- 35
hist(locations, col=color, breaks=bins, main=title1, xlab=x.axis)
for (sample in samples) {
  hist(sample, breaks=bins, main=title2, xlab=x.axis)
}



# Scatterplot of spacing between consecutive palindromes
title1 <- 'Spacing between Consecutive Palindromes'
title2 <- c(title1,'(Simulated)')
x.axis <- 'Base Pair Location'
y.axis <- 'Distance (Base Pairs) from Previous Palindrome'
y.range <- c(0,5000)
plot(locations[-1], diff(locations), col=color, ylim=y.range, main=title1, xlab=x.axis, ylab=y.axis)
for (sample in samples) {
  plot(sample[-1], diff(sample), ylim=y.range, main=title2, xlab=x.axis, ylab=y.axis)
}



# Histogram of counts of palindromes in non-overlapping regions in original data and uniform scatter.
interval.length <- 2500
title1 <- paste('Number of Palindromes in Non-Overlapping Regions of Length', interval.length)
title2 <- c(title1,'(Simulated)')
x.axis <- 'Number of Palindromes'
bins <- seq(0,20,1)
hist(as.vector(table(cut(locations, breaks=seq(0,N,interval.length), include.lowest = TRUE))), breaks=bins, col=color, main=title1, xlab=x.axis)
for (sample in samples) {
  hist(as.vector(table(cut(sample, breaks=seq(0,N,interval.length), include.lowest = TRUE))), breaks=bins, main=title2, xlab=x.axis)
}
```



## Scenario 2: Locations and Spacings
Use graphical methods to examine the spacings between consecutive palindromes and sum of consecutive pairs, triplets, etc, spacings. Compare what you find to what you would expect to find in a random scatter. Also, use graphical methods to compare locations of the palindromes.

Locations
We performed the tests three times over three different lengths of sub-intervals.
```{r}
# Histogram of locations of palindromes in original data and uniform scatter
uniform <- sample.int(N, size = n)
hist(locations, breaks = 20, probability = TRUE, col = rgb(1,0,0,0.5), main = "Location Data Distribution Comparison", xlab= "Palindrome Locations")
lines(density(locations, adjust = 2), col = 2)
hist(uniform,breaks = 20, probability = TRUE, col = rgb(0,0,1,0.5), add = TRUE)
lines(density(uniform, adjust = 2), col = 4)
legend(x = 180000, y = 0.000008, legend = c("Sample", "Uniform"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-square Goodness of Fit Test
# Case 1: k(number of sub-intervals) = 20
k <- 20
locations.expected <- n/k
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
locations.observed <- as.vector(tab)
chi_2 <- sum((locations.observed - locations.expected)^2/locations.expected)
chi2_compare <- qchisq(p = 0.95, df = 19)
p_value <- pchisq(chi_2, df = 19, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 20 sub-intervals) against uniform distribution\n"))
print(paste("The value of chi_square statistic is", chi_2))
print(paste("The p_value is", p_value))

## Visualization of the Residual
Residuals <- (locations.observed - locations.expected) / sqrt(locations.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 20 sub-intervals)")

# Case 2: k(number of sub-intervals) = 30
k <- 30
locations.expected <- n/k
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
locations.observed <- as.vector(tab)
chi_2 <- sum((locations.observed - locations.expected)^2/locations.expected)
chi2_compare <- qchisq(p = 0.95, df = 29)
p_value <- pchisq(chi_2, df = 29, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 30 sub-intervals) against uniform distribution\n"))
print(paste("The value of chi_square statistic is", chi_2))
print(paste("The p_value is", p_value))

## Visualization of the Residual
Residuals <- (locations.observed - locations.expected) / sqrt(locations.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 30 sub-intervals)")

# Case 3: k(number of sub-intervals) = 60
k <- 60
locations.expected <- n/k
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
locations.observed <- as.vector(tab)
chi_2 <- sum((locations.observed - locations.expected)^2/locations.expected)
chi2_compare <- qchisq(p = 0.95, df = 59)
p_value <- pchisq(chi_2, df = 59, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 60 sub-intervals) against uniform distribution\n"))
print(paste("The value of chi_square statistic is", chi_2))
print(paste("The p_value is", p_value))

## Visualization of the Residual
Residuals <- (locations.observed - locations.expected) / sqrt(locations.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 60 sub-intervals)")
```
Inference:
Null Hypothesis: Locations are distributed uniformly.
Conclusion: Since p-value of this chi-square test is smaller than 0.05, we fail to reject the null hypothesis. It indicates that deviations as large as ours are not so likely. Hence, we conclude that it appears that uniform is not a reasonable initial model. 
Residual Conclusion:
Since the value of the standardized residual is larger than 3, the probability model of a uniform distribution is lack of fit.

Spacings
We performed the tests three times over three different lengths of sub-intervals.
```{r}
# Consecutive Pairs
locations.sorted = sort(locations, decreasing = FALSE)
distance.pair <- abs(locations.sorted[-1]-locations.sorted[-length(locations.sorted)])

# Histogram of spacings of palindromes in original data and exponential distribution
hist(distance.pair, breaks= 15, col = rgb(1,0,0,0.5), probability = TRUE, main = "Consecutive Pairs Spacings Distribution Comparison", xlab = "Distance between Consecutive Palindromes Locations", ylim = c(0,0.001))
lines(density(distance.pair, adjust = 2), col = rgb(1,0,0,0.5))
Expo <- rexp(n-1, rate = 1/mean(distance.pair))
hist(Expo, breaks = 15, col = rgb(0,0,1,0.5), probability = TRUE, add = TRUE)
lines(density(Expo, adjust = 2), col = rgb(0,0,1,0.5))
legend(x = 4200, y = 0.0009, legend = c("Sample", "Exponential"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-square Goodness of Fit Test
# Construct observed numbers of intervals

# Case 1: Divided in 7 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.pair, decreasing = FALSE)
lambda <- 1/mean(distance.pair)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(0,0.05, 0.1, 0.3, 0.5, 0.7, 0.9,1)))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.pair, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_7 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_7

chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 5)
p_value <- pchisq(chi_2, df = 5, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 7 sub-intervals is", p_value))
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 7 bins)")

# Case 2: Divided in 10 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.pair, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.1))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.pair, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_10 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_10

chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 8)
p_value <- pchisq(chi_2, df = 8, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 10 sub-intervals is", p_value))
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 10 bins)")

# Case 3: Divided in 20 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.pair, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.05))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.pair, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_20 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_20

chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 18)
p_value <- pchisq(chi_2, df = 18, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 20 sub-intervals is", p_value))
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 20 bins)")
```
```{r}
# Consecutive Triplets
locations.sorted <-  sort(locations, decreasing = FALSE)
locations.triplets <- locations.sorted[-length(locations.sorted)]
distance.triplets <- abs(locations.sorted[-1][-1]-locations.triplets[-length(locations.triplets)])

# Histogram of spacings of palindromes in original data and exponential distribution
hist(distance.triplets, breaks= 15, col = rgb(1,0,0,0.5), probability = TRUE, main = "Consecutive Triplets Spacings Distribution Comparison", xlab = "Distance between Consecutive Palindromes Locations", ylim = c(0,0.001))
lines(density(distance.triplets, adjust = 2), col = rgb(1,0,0,0.5))
Gam <- rgamma(n-2, 2, rate = 2/mean(distance.triplets))
hist(Gam, breaks = 15, col = rgb(0,0,1,0.5), probability = TRUE, add = TRUE)
lines(density(Gam, adjust = 2), col = rgb(0,0,1,0.5))
legend(x = 4200, y = 0.0005, legend = c("Sample", "Gamma"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-square Goodness of Fit Test
# Construct observed numbers of intervals

# Case 1: Divided in 7 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.triplets, decreasing = FALSE)
lambda <- 2/mean(distance.pair)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(0,0.05, 0.1, 0.3, 0.5, 0.7, 0.9,1)))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.triplets, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_7 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_7

chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 5)
p_value <- pchisq(chi_2, df = 5, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 7 sub-intervals is", p_value))
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 7 bins)")

# Case 2: Divided in 10 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.triplets, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.1))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.triplets, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_10 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_10

chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 8)
p_value <- pchisq(chi_2, df = 8, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 10 sub-intervals is", p_value))
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 10 bins)")

# Case 3: Divided in 20 intervals
# Construct expected number of intervals
spacings.observed <- sort(distance.triplets, decreasing = FALSE)
spacings.intervals <- as.numeric(quantile(spacings.observed, probs = c(seq(0,1, by = 0.05))))
spacings.expected <- (n-1)*(exp(-lambda*spacings.intervals[-length(spacings.intervals)])-exp(-lambda*spacings.intervals[-1]))
spacings.expected[length(spacings.expected)] <- n-sum(spacings.expected[1:length(spacings.expected)-1])
spacings.observed <- as.numeric(table(cut(distance.triplets, breaks = spacings.intervals, include.lowest = TRUE)))
contingency_20 <- data.frame(spacings.intervals[-1],spacings.observed,spacings.expected)
contingency_20

chi_2 <- sum((spacings.observed - spacings.expected)^2/spacings.expected)
chi2_compare <- qchisq(p = 0.95, df = 18)
p_value <- pchisq(chi_2, df = 18, lower.tail = FALSE)
print(paste("The p_value when the distance is splited into 20 sub-intervals is", p_value))
## Visualization of the Residual
Residuals <- (spacings.observed - spacings.expected) / sqrt(spacings.expected)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Palindrome locations", main = "Plot of Standardized Residual for Locations (divided in 20 bins)")

```


## Scenario 3: Counts
Use graphical methods and more formal statistical tests to examine the counts of palindromes in various regions of the DNA. Split the DNA into nonoverlapping regions of equal length to compare the number of palindomres in an interval to the number of that you would expect from uniform random scatter. The counts for shorter regions will be more variable than those for logner regions. Also, consider classifying the regions according to the number of counts.
```{r}
k <- 50
tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
counts.obs <- as.vector(tab)
a <- table(cut(counts.obs, breaks = seq(min(counts.obs), max(counts.obs), length.out = k+1), include.lowest = TRUE))

# Histogram of counts of palindromes in original data and poisson distribution
hist(counts.obs, breaks = 15, col = rgb(1,0,0,0.5), probability = TRUE, main = "Counts Distribution Comparison (50 Sub-intervals)", xlab = "Number of Palindromes Sites Inside an Interval", ylim = c(0,0.2))
lines(density(counts.obs, adjust = 2), col = rgb(1,0,0,0.5))
Pois <- rpois(n, lambda = mean(counts.obs))
hist(Pois, breaks = 15, col = rgb(0,0,1,0.5), probability = TRUE, add = TRUE)
lines(density(Pois, adjust = 2), col = rgb(0,0,1,0.5))
legend(x = 12, y = 0.15, legend = c("sample", "Poisson"), lty = c(1,1), col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))

# Chi-Square Testing
# Construct palindrome count number
palindrome.count <- c("0,1,2", "3","4","5","6","7","8+")
# Construct observed number of intervals
intervals.observed <- c(8, 9, 13, 10, 8, 8, 5) 
# Construct expected number of intervals
expected <- c()
lambda <- n/k
for (i in c(0:17)){
  expect <- k* exp(-lambda)* lambda**i /factorial(i)
  expected <- c(expected, expect)
}
sum <- 0
for (j in c(9:17)){
  sum <- sum+expected[j]
}
intervals.expected <- c(expected[1]+expected[2]+expected[3],expected[4],expected[5],expected[6],expected[7],expected[8],sum)

# Create contingency table
b <- data.frame(palindrome.count,intervals.observed,intervals.expected)
b

chi_2 <- sum((intervals.observed - intervals.expected)^2/intervals.expected)
chi2_compare <- qchisq(p = 0.95, df = 7)
p_value <- pchisq(chi_2, df = 7, lower.tail = FALSE)
print(cat("\nWhen conducting chi_square Goodness of fit test comparing locations(divided in 500 sub-intervals) against uniform distribution\n"))
print(paste("The value of chi_square statistic is", chi_2))
print(paste("The p_value is", p_value))

```


## Scenario 4: The Biggest Cluster
Does the interval with the greatest number of palindromes indicate a potential origin of replication? Be careful in making your intervals, for any small, but significant deviations from random scatter, such as a tight cluster of a few palindromes, could easily go undetected if the regions examined are too large. Also, if the regions are too small, a cluster of palindromes may be split between adjacent interavls and not appear as a high-count interval.
```{r}
final <- array(dim=c(500,1))
interval_length <- array(dim=c(500,1))
lamda <- array(dim=c(500,1))
for (k in 20:100){
  tab <- table(cut(locations, breaks = seq(0, N, length.out = k+1), include.lowest = TRUE))
  head(tab,10)
  tab<-as.vector(tab)
  lamda[k,] <-sum(tab)/k
  threshold <-max(tab)
  result <- 0
  interval_length[k,] <- N/k
  for (i in 0:(threshold-1)){
    result <- result+((lamda[k]^i)*exp(-lamda[k])/factorial(i))
  }
  final[k,] <- 1-result^k
}
result <- data.frame(lamda,interval_length,final)

# Display Table containing the probability of a Poisson Distribution having e greatest number of hits at least k for each sub-interval divisions
result[c(20:100),]
```


## Additional Scenario: Testing if HIV Testing positive is related to age
TODO Description
```{r}
# Clean out 'unknown' data and convert factor to numerical values
health <- transform(health, age_yrs = as.numeric(age_yrs),
                            hiv = as.character(hiv))
health.ind <- which(health$hiv != "unknown")
health <- health[health.ind,]

# Total number of people that have hiv
population = nrow(health)
pop_hiv <- nrow(health[which(health$hiv=='positive'),])

# Split the age into four groups
# 0-20
age_first <- health$age_yrs[which(health$age_yrs<21)]
age_proportion_first <- length(age_first)/population
hiv_proportion_first<- nrow(health[which((health$hiv== "positive") & (health$age_yrs <21)),])/pop_hiv

# 21-40
age_second<-health$age_yrs[which(health$age_yrs>20 & health['age_yrs']<41)]
age_proportion_second <- length(age_second)/population
hiv_proportion_second<-nrow(health[which(health$age_yrs>20 &health$age_yrs<41 & health$hiv=="positive"),])/pop_hiv

# 41-60
age_third<-health$age_yrs[which(health['age_yrs']>40 & health['age_yrs']<61)]
age_proportion_third <- length(age_third)/population
hiv_proportion_third<-nrow(health[which(health$age_yrs>40 & health$age_yrs<61 &health$hiv=="positive"),])/pop_hiv

# 61+
age_last<-health$age_yrs[which(health['age_yrs']>60)]
age_proportion_last <- length(age_last)/population
hiv_proportion_last<-nrow(health[which(health$age_yrs>60 & health$hiv=="positive"),])/pop_hiv

# Expected Data
population_dist <-c(age_proportion_first,age_proportion_second,age_proportion_third,age_proportion_last)
# Observed Data
hiv_dist<-c(hiv_proportion_first,hiv_proportion_second,hiv_proportion_third,hiv_proportion_last)

# Goodness-fittest
chi_2 <- sum((hiv_dist - population_dist)^2/population_dist)
chi2_compare <- qchisq(p = 0.95, df = 3)
p_value <- pchisq(chi_2, df = 3, lower.tail = FALSE)
print(paste("The p_value of Goodness of Fit Test is",p_value))

#Visualization
Residuals <- (hiv_dist - population_dist) / sqrt(population_dist)
plot(Residuals, type = 'h', ylab = "Standardized Residuals", xlab = "Proportion of Positive HIV", main = "Plot of Standardized Residual for Age and HIV Positive")
```


Null Hypothesis: The proportion of age in the population is unrelated with the proportion of people having hiv.(Age is not an influencing factor for HIV testing positive)
Since p-value of this chi-square goodness of fit test is close to 1, we see that deviations as large as ours (or larger) are very likely. In addition, having values of the standardized residual less than 3 suggests that it is a good fit of the age distribution to estimate the people testing positive on hiv. Hence, we reject the null hypothesis and conclude that the distribution of proportion of age matches with the the distribution of people testing positive on HIV.

